package cc.mallet.topics.gui;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileOutputStream;
import java.io.FileReader;
import java.io.IOException;
import java.io.ObjectInputStream;
import java.io.ObjectOutputStream;
import java.io.Serializable;
import java.util.HashSet;
import java.util.StringTokenizer;

import cc.mallet.pipe.Pipe;
import cc.mallet.share.weili.ner.WordTransformation;
import cc.mallet.types.Instance;
import cc.mallet.types.LabelAlphabet;
import cc.mallet.types.LabelSequence;
import cc.mallet.types.Token;
import cc.mallet.types.TokenSequence;

/**
 * Pipe that reads an Enron e-mail file and converts its body into a
 * {@link TokenSequence} (data) with a parallel {@link LabelSequence} (target).
 *
 * <p>
 * The incoming instance's data must be a {@link File}. Processing happens in
 * three steps:
 * <ol>
 * <li>The leading header (everything before the first blank line) is skipped.
 * While skipping, the address headers {@code from:}, {@code sent:},
 * {@code to:}, {@code bcc:} and {@code cc:} are scanned for
 * {@code first.last@enron.com} style addresses and the name parts are stored
 * in {@link #headerPersonNames}.</li>
 * <li>The body is read line by line. Lines introduced by one of the
 * {@link #skip} markers are dropped; lines introduced by one of the
 * {@link #skipToBlankLine} markers are dropped together with everything up to
 * the next blank line.</li>
 * <li>The remaining text is tokenized. {@code ENAMEX}, {@code TIMEX} and
 * {@code NUMEX} tags whose {@code TYPE} is listed in {@link #labels} produce
 * {@code B-TYPE} / {@code I-TYPE} labels; everything else is labeled
 * {@code O}.</li>
 * </ol>
 *
 * <p>
 * Note that {@link #headerPersonNames} is never cleared, so names accumulate
 * over all instances piped through the same object.
 */
public class EnronEmail2TokenSequence extends Pipe implements Serializable {
	/** When true, the words emitted by pipe() are also stored, one per line, as the instance source. */
	boolean saveSource = false;
	/** Lower-case markers; a body line introduced by one of them is dropped. */
	public static String[] skip = new String[] { "=_part_", "sent by:" };
	/**
	 * Lower-case markers; a body line introduced by one of them is dropped
	 * together with all following lines up to the next blank line. Entries
	 * 5..9 double as the address headers scanned for person names.
	 */
	public static String[] skipToBlankLine = new String[] { "subject:",
			"original message", "content-type:", "content-transfer-encoding:",
			"forwarded by", "from:", "sent:", "to:", "bcc:", "cc:" };
	/** Entity types that are turned into B-/I- labels; other types are ignored. */
	public static String[] labels = new String[] { "DATE", "TIME", "LOCATION",
			"PERSON", "ORGANIZATION", "ACRONYM", "PHONE", "MONEY", "PERCENT" };
	/** Lower-case name parts harvested from the address headers seen so far. */
	HashSet headerPersonNames;

	/** Index in skipToBlankLine of the first address header ("from:"). */
	private static final int FIRST_ADDRESS_HEADER = 5;
	/** Index in skipToBlankLine of the last address header ("cc:"). */
	private static final int LAST_ADDRESS_HEADER = 9;
	/** Only addresses with this (lower-case) suffix contribute person names. */
	private static final String ADDRESS_SUFFIX = "@enron.com";

	public EnronEmail2TokenSequence() {
		super(null, new LabelAlphabet());
		headerPersonNames = new HashSet();
	}

	/**
	 * Replaces the instance's {@link File} data with the tokenized message
	 * body and sets the label sequence as its target.
	 *
	 * @param carrier
	 *            instance whose data is the e-mail {@link File}
	 * @return the same instance, with data, target and (if saveSource is set)
	 *         source replaced
	 */
	public Instance pipe(Instance carrier) {
		TokenSequence data = new TokenSequence();
		LabelSequence target = new LabelSequence(
				(LabelAlphabet) getTargetAlphabet());
		StringBuffer source = saveSource ? new StringBuffer() : null;
		WordTransformation wt = new WordTransformation();

		File f = (File) carrier.getData();
		String message = readMessageBody(f);

		String currentLabel = "O";
		StringTokenizer st = new StringTokenizer(message, "<>", true);
		// Text that was read as a potential tag but turned out to be ordinary
		// content; it is processed exactly once on the next iteration.
		String pending = null;
		while (pending != null || st.hasMoreTokens()) {
			String text;
			if (pending != null) {
				text = pending;
				pending = null;
			} else {
				text = st.nextToken();
			}

			if (text.equals("<") && st.hasMoreTokens()) {
				String tag = st.nextToken();
				if (tag.equals("/ENAMEX") || tag.equals("/TIMEX")
						|| tag.equals("/NUMEX")) {
					consumeTagClose(st);
					currentLabel = "O";
					continue;
				}
				if (tag.startsWith("ENAMEX") || tag.startsWith("TIMEX")
						|| tag.startsWith("NUMEX")) {
					String type = tag.substring(tag.indexOf(" ") + 1);
					assert (type.startsWith("TYPE="));
					int open = type.indexOf("\"");
					int close = type.lastIndexOf("\"");
					// A tag without a properly quoted type leaves the label
					// unchanged instead of failing on substring().
					if (open != -1 && close > open) {
						type = type.substring(open + 1, close);
						// nested entities are not handled: an inner tag
						// simply replaces the current label
						for (int i = 0; i < labels.length; i++) {
							if (labels[i].equals(type)) {
								currentLabel = "B-" + type;
								break;
							}
						}
					}
					consumeTagClose(st);
					continue;
				}
				// false alarm: "<" was literal text, not the start of a tag
				data.add(new Token("<"));
				target.add(currentLabel);
				if (saveSource) {
					source.append("<");
					source.append("\n");
				}
				// Re-examine what followed the "<" from the top of the loop so
				// that it is emitted once (and may itself start a tag).
				pending = tag;
				continue;
			}

			// there is no tag in "text"
			StringTokenizer wordst = new StringTokenizer(text,
					"~`!@#$%^&*()_-+={[}]|\\:;\"',<.>?/ \t\n\r", true);
			while (wordst.hasMoreTokens()) {
				String word = wordst.nextToken();
				if (word.equals(" ") || word.equals("\t") || word.equals("\n")
						|| word.equals("\r"))
					continue;
				Token token = wt.transformedToken(word);
				// Mark words that appeared as a name part in an address header
				if (headerPersonNames.contains(word.toLowerCase())) {
					token.setFeatureValue("HEADER-PERSON", 1.0);
				}

				data.add(token);
				target.add(currentLabel);
				if (saveSource) {
					source.append(word);
					source.append("\n");
				}

				// only the first word of an entity carries the B- prefix
				if (currentLabel.startsWith("B-"))
					currentLabel = "I-" + currentLabel.substring(2);
			}
		}

		carrier.setData(data);
		carrier.setTarget(target);
		if (saveSource)
			carrier.setSource(source);
		return carrier;
	}

	/**
	 * Reads the file, harvests person names from the leading header and
	 * returns the filtered body, each kept line terminated by "\n". An
	 * IOException is reported on stderr and whatever was read so far is
	 * returned. The reader is always closed.
	 */
	private String readMessageBody(File f) {
		StringBuffer message = new StringBuffer();
		BufferedReader br = null;
		try {
			br = new BufferedReader(new FileReader(f));
			readLeadingHeader(br);

			String line;
			while ((line = br.readLine()) != null) {
				String lowered = line.toLowerCase();
				if (isMarkerLine(lowered, skip))
					continue;
				if (isMarkerLine(lowered, skipToBlankLine)) {
					// drop the embedded header block up to the next blank line
					while ((line = br.readLine()) != null) {
						if (line.equals(""))
							break;
					}
					continue;
				}
				message.append(line);
				message.append("\n");
			}
		} catch (IOException e) {
			System.err.println(e);
		} finally {
			if (br != null) {
				try {
					br.close();
				} catch (IOException e) {
					System.err.println(e);
				}
			}
		}
		return message.toString();
	}

	/**
	 * Consumes the header up to (and including) the first blank line,
	 * collecting person names from the address headers on the way.
	 */
	private void readLeadingHeader(BufferedReader br) throws IOException {
		String line = br.readLine();
		while (line != null && !line.equals("")) {
			line = line.toLowerCase();
			int i = addressHeaderIndex(line);
			if (i == -1) {
				line = br.readLine();
				continue;
			}
			StringBuffer header = new StringBuffer(
					line.substring(skipToBlankLine[i].length()));
			// Folded continuation lines start with a space or tab. They are
			// lower-cased like the first line so that the address suffix
			// matches and the stored names agree with the lower-case lookup.
			while ((line = br.readLine()) != null
					&& (line.startsWith(" ") || line.startsWith("\t"))) {
				header.append(line.toLowerCase());
			}
			addHeaderPersonNames(header.toString());
		}
	}

	/**
	 * Returns the index in skipToBlankLine of the address header the
	 * lower-cased line starts with, or -1 if it starts with none.
	 */
	private static int addressHeaderIndex(String loweredLine) {
		int last = Math.min(LAST_ADDRESS_HEADER, skipToBlankLine.length - 1);
		for (int i = FIRST_ADDRESS_HEADER; i <= last; i++) {
			if (loweredLine.startsWith(skipToBlankLine[i]))
				return i;
		}
		return -1;
	}

	/**
	 * Extracts name parts from every "...@enron.com" address in the given
	 * (lower-case) header value and adds them to headerPersonNames.
	 */
	private void addHeaderPersonNames(String header) {
		StringTokenizer st = new StringTokenizer(header, " \t,");
		while (st.hasMoreTokens()) {
			String token = st.nextToken();
			if (!token.endsWith(ADDRESS_SUFFIX)) {
				continue;
			}
			token = token.substring(0, token.length() - ADDRESS_SUFFIX.length());
			int dot = token.indexOf(".");
			if (dot == -1) {
				continue;
			}
			int lastDot = token.lastIndexOf(".");
			if (dot != lastDot) {
				// More than one dot: only the "x..last" form (two adjacent
				// dots) is used, and only its last part.
				if (dot == lastDot - 1) {
					dot++;
					if (dot + 1 < token.length() - 1)
						headerPersonNames.add(token.substring(dot + 1));
				}
				continue;
			}
			// "first.last": keep each part only if it is longer than one
			// character
			if (dot > 1)
				headerPersonNames.add(token.substring(0, dot));
			if (dot + 1 < token.length() - 1)
				headerPersonNames.add(token.substring(dot + 1));
		}
	}

	/**
	 * Tells whether the lower-cased line contains one of the markers preceded
	 * by nothing but '-', '>' and ' ' characters (after trimming).
	 */
	private static boolean isMarkerLine(String lowered, String[] markers) {
		for (int i = 0; i < markers.length; i++) {
			int index = lowered.indexOf(markers[i]);
			if (index == -1)
				continue;
			// The prefix is taken from the same string the index refers to.
			String prefix = lowered.substring(0, index).trim();
			boolean onlyQuoting = true;
			for (int j = 0; j < prefix.length(); j++) {
				char c = prefix.charAt(j);
				if (c != '-' && c != '>' && c != ' ') {
					onlyQuoting = false;
					break;
				}
			}
			if (onlyQuoting)
				return true;
		}
		return false;
	}

	/**
	 * Consumes the token that follows a tag, which is expected to be ">".
	 * Does nothing when the input ends right after the tag.
	 */
	private static void consumeTagClose(StringTokenizer st) {
		if (st.hasMoreTokens()) {
			String nextToken = st.nextToken();
			assert (nextToken.equals(">"));
		}
	}

	/**
	 * Serializes the collected header person names to the given file. Errors
	 * are reported on stderr; the underlying stream is always closed.
	 *
	 * @param f
	 *            destination file
	 */
	public void write(File f) {
		FileOutputStream fos = null;
		try {
			fos = new FileOutputStream(f);
			ObjectOutputStream oos = new ObjectOutputStream(fos);
			oos.writeObject(headerPersonNames);
			// flushes buffered data and closes the underlying file stream
			oos.close();
		} catch (IOException e) {
			System.err.println("Exception writing file " + f + ": " + e);
		} finally {
			if (fos != null) {
				try {
					// no-op after a successful oos.close(); releases the
					// handle when writing failed part-way
					fos.close();
				} catch (IOException e) {
					System.err.println("Exception writing file " + f + ": "
							+ e);
				}
			}
		}
	}

	// Serialization

	private static final long serialVersionUID = 1;
	private static final int CURRENT_SERIAL_VERSION = 0;

	/** Writes the format version, the saveSource flag and the name set. */
	private void writeObject(ObjectOutputStream out) throws IOException {
		out.writeInt(CURRENT_SERIAL_VERSION);
		out.writeBoolean(saveSource);
		out.writeObject(headerPersonNames);
	}

	/** Reads the fields in the order written by writeObject. */
	private void readObject(ObjectInputStream in) throws IOException,
			ClassNotFoundException {
		// the version is read to advance the stream; only one format exists
		int version = in.readInt();
		saveSource = in.readBoolean();
		headerPersonNames = (HashSet) in.readObject();
	}
}
